Q1
First I load the data files
Next I reshape dat into a handy long format. I use the tidyr package for this instead of the reshape package because the former is the improved version of the latter.
install.packages('tidyverse')
URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/tidyverse_1.1.1.tgz' を試しています
Content type 'application/x-gzip' length 37228 bytes (36 KB)
==================================================
downloaded 36 KB
The downloaded binary packages are in
/var/folders/1w/nyst5xl16t329h4gpz0j49hm0000gn/T//Rtmp2exVAC/downloaded_packages
library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ----------------------------------------------------------------------------------------------------------------------------------
filter(): dplyr, stats
lag(): dplyr, stats
dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District\ of\ Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New\ Hampshire,New\ Jersey,New\ Mexico,New\ York,North\ Carolina,North\ Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode\ Island,South\ Carolina,South\ Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West\ Virginia,Wisconsin,Wyoming)
エラー: 想定外の入力です in:
"dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District\"
Delete the punctuation.
Then I add two new columns to the above data frame.
d1 <- merge(dat, states, by.x="From", by.y="States")
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"
d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
dat <- d2
dat
Sort the data frame in the ascending order
dat <- arrange(dat, ID_From)
dat <- arrange(dat, ID_To)
dat
Convert the above data frame into the matrix form
Drawing the chord diagram
chorddiag(as.matrix(t5),groupColors=states$Color,showTicks=F,groupnamePadding = 20,groupThickness=.05,groupnameFontsize=10)
row names of the 'data' matrix differ from its column names or the 'groupNames' argument.
Q2
Read “Stops On Lines” and the all GIS data of bus lines.
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)
# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))
# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))
make Line_list
Line_list <- list()
for (i in 1:6){
for (j in 1:length(tmps[[i]])){
Line_list <- c(Line_list, tmps[[i]][j]@lines[[1]]@Lines)
}
}
エラー: スロット "lines"を、スロットを持たない基本クラス ("list") のオブジェクトから得ようとしました
make new_id
pjs <- list(CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)
LinLSs <- list()
for (i in 1:5){
LinLSs <- c(LinLSs, sapply(pjs[[i]]@lines, function(x) length(x@Lines)))
}
LinLSs <- LinLSs %>% unlist()
new_id <- sapply(1:length(LinLSs), function(x) paste0(x, "_", seq.int(LinLSs[[x]]))) %>%
unlist()
SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
list() %>%
SpatialLines() %>%
SpatialLinesDataFrame(data = DAT)
SpatialLines(.) でエラー:
lines list not exclusively filled with Lines objects
make new lines and LA map
Q3
First I load the data.
library(quantmod)
library(highcharter)
x <- getSymbols("AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols("GBP/USD", src = "oanda", auto.assign = FALSE)
Next make Bollinger’s bands for each exchange rate.
x.BBands.ll <- BBands(x)$dn
x.BBands.ul <- BBands(x)$up
x.BBands.m <- BBands(x)$mavg
y.BBands.ll <- BBands(y)$dn
y.BBands.ul <- BBands(y)$up
y.BBands.m <- BBands(y)$mavg
The drawing code is as follows.
hc <- highchart(type="stock") %>%
hc_title(text="Charting Exchange Rates") %>%
hc_subtitle(text = "Data extracted using quantmod package") %>%
hc_yAxis_multiples(
list(top = "0%", height = "50%", offset=0, opposite=TRUE),
list(top = "50%", height = "50%", offset=0, opposite=TRUE)
)%>%
hc_add_series(x, id = "audjpy",name ="audjpy", yAxis=0, color="blue", lineWidth=1.5) %>%
hc_add_series(x.BBands.ll, id = "audjpy.ll", name="audjpy Lower BBands",yAxis=0,
color="black",dashStyle='shortdash', lineWidth=1) %>%
hc_add_series(x.BBands.ul, id = "audjpy.ul", name="audjpy Upper BBands",yAxis=0,
color="black",lineWidth=1) %>%
hc_add_series(x.BBands.m, id = "audjpy.m",name="audjpy BBands MA", yAxis=0,
color="red",lineWidth=1) %>%
hc_add_series(y, id = "gbpusd",name="gbpusd",yAxis=1, color="green", lineWidth=1.5) %>%
hc_add_series(y.BBands.ll, id = "gbpusd.ll",name="gbpusd Lower BBands", yAxis=1,
color="black",dashStyle='shortdash',lineWidth=1) %>%
hc_add_series(y.BBands.ul, id = "gbpusd.ul",name="gbpusd Upper BBands", yAxis=1,
color="black",lineWidth=1) %>%
hc_add_series(y.BBands.m, id = "gbpusd.m",name="gbpusd BBands MA", yAxis=1,
color="red",lineWidth=1) %>%
hc_add_theme(hc_theme_538())
hc
Q4
Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)
tmp <- flights
tmp$carrier <- as.factor(tmp$carrier)
tmp$tailnum <- as.factor(tmp$tailnum)
tmp$origin <- as.factor(tmp$origin)
tmp$dest <- as.factor(tmp$dest)
flightff <- as.ffdf(tmp)
Next I make new columns as follows
flightff$Delay <- ffifelse(flightff$dep_delay > 0 | flightff$dep_delay == 0 , 1,0)
flightff$DepHour <- flightff$hour
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)
I exclude the rows whose Delay values are NA and rename it to logitff. And then I split the dataset into train set and test set.
logitff <- flightff[!is.na(flightff$Delay),]
indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]
Logistic regression
fit <- bigglm.ffdf(Delay~DepHour+Car+Night+Weekend, data = trainset, family=binomial(), sandwich=TRUE)
summary(fit)
Large data regression model: bigglm(Delay ~ DepHour + Car + Night + Weekend, data = trainset,
family = binomial(), sandwich = TRUE)
Sample size = 229964
Coef (95% CI) SE p
(Intercept) -1.5418 -1.5731 -1.5104 0.0157 0
DepHour 0.1020 0.0996 0.1044 0.0012 0
Car -0.0680 -0.0858 -0.0503 0.0089 0
Night -0.2619 -0.2912 -0.2326 0.0147 0
Weekend -0.1543 -0.1821 -0.1265 0.0139 0
Sandwich (model-robust) standard errors
predict and make confusionmatrix in train_set
train_pred <- predict(fit, newdata = trainset, type="response")
train_pred <- ifelse(train_pred>0.5, 1,0)
train_confusion <- table(as.integer(as.data.frame(trainset)$Delay), as.integer(train_pred))
train_confusion <- addmargins(train_confusion)
train_confusion
0 1 Sum
0 94128 36392 130520
1 56845 42599 99444
Sum 150973 78991 229964
predict and make confusionmatrix in test_set
test_pred <- predict(fit, newdata = testset, type="response")
test_pred <- ifelse(test_pred>0.5, 1,0)
test_confusion <- table(as.integer(as.data.frame(testset)$Delay), as.integer(test_pred))
test_confusion <- addmargins(test_confusion)
test_confusion
0 1 Sum
0 39997 13058 53055
1 25558 19944 45502
Sum 65555 33002 98557
Draw ROC curve
test_pred <- predict(fit, newdata = testset, type="response")
roc <- roc(as.integer(as.data.frame(testset)$Delay), as.numeric(test_pred))
plot(roc)

Q5
First I load the data. And before using spark I delete the irrelevant columns.
Remove the observations satisfying the condition
Split this data into trainset and testset.
test
$test
NA
Use Decision tree
decision_tree <- train %>%
ml_decision_tree(response="BOROUGH", features = c("LATITUDE","LONGITUDE"), max.bins = 200L, max.depth = 10L, seed=123L) %>%
Prediction
table(pred$BOROUGH, pred$prediction)
0 1 2 3 4
BRONX 0 0 0 8876 0
BROOKLYN 21181 0 24 0 1
MANHATTAN 0 17944 0 0 0
QUEENS 25 1 17777 0 0
STATEN ISLAND 23 0 0 0 3169
---
title: "Big Data Analytics Assignment 1"
output: html_notebook
---

## Q1

First I load the data files
```{r}
# Read migration2012.csv from the local assignment folder and preview the
# first rows.
dat <- read.csv(
  file = "/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv"
)
head(dat)
```

```{r}
# Read states_chord.csv (the lookup table merged on its `States` column below)
# and print it in full.
states <- read.csv(
  file = "/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv"
)
states
```

Next I reshape `dat` into a handy long format. I use the **tidyr package** for this instead of the **reshape package** because the former is the improved version of the latter.

```{r}
# Install tidyverse only when it is missing, so re-running the notebook does
# not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
```

```{r}
# Stack the listed state columns into key/value pairs: the column name goes
# into `From` and the cell content into `value`.
dat <- tidyr::gather(
  dat,
  key = From, value = value,
  Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut,
  Delaware, District.of.Columbia, Florida, Georgia, Hawaii, Idaho, Illinois,
  Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts,
  Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada,
  New.Hampshire, New.Jersey, New.Mexico, New.York, North.Carolina,
  North.Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode.Island,
  South.Carolina, South.Dakota, Tennessee, Texas, Utah, Vermont, Virginia,
  Washington, West.Virginia, Wisconsin, Wyoming
)
```

Delete the punctuation.

```{r}
# Replace punctuation with spaces in the two name columns only
# (e.g. "District.of.Columbia" -> "District of Columbia").
# Running gsub() over every column via apply() would coerce the whole data
# frame to character and also rewrite punctuation inside `value`, turning a
# decimal point or minus sign into a space.
dat$To <- gsub("[[:punct:]]", " ", dat$To)
dat$From <- gsub("[[:punct:]]", " ", dat$From)
dat
```

Then I add two new columns to the above data frame.

```{r}
# Join the lookup table on the origin name, drop columns 5 and 6 of the
# result, and label column 4 as the origin id.
d1 <- merge(x = dat, y = states, by.x = "From", by.y = "States")
d1 <- d1[, -(5:6)]
names(d1)[4] <- "ID_From"

# Same again for the destination name: drop columns 6 and 7 and label
# column 5 as the destination id.
d2 <- merge(x = d1, y = states, by.x = "To", by.y = "States")
d2 <- d2[, -(6:7)]
names(d2)[5] <- "ID_To"

dat <- d2
dat
```

Sort the data frame in the ascending order

```{r}
# Order rows by destination id, breaking ties by origin id. Giving both keys
# in one call states the intended order directly instead of relying on a
# second sort preserving the order left by a first one.
dat <- arrange(dat, ID_To, ID_From)
dat
```

Convert the above data frame into the matrix form

```{r}
# Wide table of ids: keep columns 1, 2, 4 and 5 of `dat` and spread the `To`
# values into one column each, filled with ID_To.
t1 <- dat[,c(1,2,4,5)]
t1 <- t1 %>%
  spread(key=To, value=ID_To)

# Wide table of flows: keep columns 1-4 of `dat` and spread the `To` values
# into one column each, filled with `value`.
t2 <- dat[,c(1,2,3,4)]
t2 <- t2 %>%
  spread(key=To, value=value)

# Put the first row of the id table on top of the flow table as a header-like
# row: label it "ID_To" in column 1, set its column 2 to 0, then sort all rows
# by ID_From.
t3 <- rbind(t1[1,],t2)
t3$From <- as.character(t3$From)
t3[1,2] <- 0
t3[1,1] <- "ID_To"
t3 <- t3 %>%
  arrange(ID_From)

# Reorder the columns into the explicit sequence listed below.
# NOTE(review): this sequence presumably mirrors the row order produced by
# sorting on ID_From - confirm against states_chord.csv.
library(data.table)
setcolorder(t3,c("From","ID_From","Connecticut","Maine","Massachusetts","New Hampshire","Rhode Island",
"Vermont","New Jersey","New York","Pennsylvania","Illinois","Indiana",
"Michigan","Ohio","Wisconsin","Iowa","Kansas","Minnesota",
"Missouri","Nebraska","North Dakota","South Dakota","Delaware","Florida",
"Georgia","Maryland","North Carolina","South Carolina","Virginia","District of Columbia",
"West Virginia","Alabama","Kentucky","Mississippi","Tennessee","Arkansas",
"Louisiana","Oklahoma","Texas","Arizona","Colorado","Idaho",
"Montana","Nevada","New Mexico","Utah","Wyoming","Alaska",
"California","Hawaii","Oregon","Washington"))

# Drop the first row and the second column (ID_From after the reordering).
# NOTE(review): the first row is the "ID_To" header row only if every real
# ID_From is greater than 0 - confirm.
t3 <- t3[c(-1),c(-2)]

# Separate the flow columns from the `From` label column and use the labels
# as row names.
t4 <- t3[, -1]
rownames(t4) <- t3[, 1]

# Convert every column to numeric. check.names = FALSE keeps column names such
# as "New Hampshire" intact; the default would rewrite them to "New.Hampshire",
# so they would no longer equal the row names and chorddiag() would reject the
# matrix.
t5 <- data.frame(apply(t4, 2, as.numeric), check.names = FALSE)
rownames(t5) <- t3[, 1]
t5
```

Drawing the chord diagram
```{r}
library(chorddiag)

# Draw the chord diagram from the flow matrix, colouring each group with the
# matching entry of states$Color.
chorddiag(
  as.matrix(t5),
  groupColors = states$Color,
  showTicks = FALSE,
  groupnamePadding = 20,
  groupThickness = .05,
  groupnameFontsize = 10
)
```


## Q2

Read "Stops On Lines" and all the GIS data for the bus lines.
```{r}
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Every layer is reprojected to the same longitude/latitude WGS84 system.
wgs84 <- CRS("+proj=longlat +datum=WGS84")

# Bus stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, wgs84)

# Bus routes, one shapefile per layer
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")

CC.pj <- spTransform(CC, wgs84)
LE.pj <- spTransform(LE, wgs84)
LCBD.pj <- spTransform(LCBD, wgs84)
LNCBD.pj <- spTransform(LNCBD, wgs84)
RBRT.pj <- spTransform(RBRT, wgs84)
```

make Line_list
```{r}
# Individual bus routes: read every layer found in the Individuals1216 folder
# except the one named 728, and reproject each to longitude/latitude WGS84.
IND.pj <- list()
layer_list <- ogrListLayers("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216")
for (i in layer_list) {
  if (i != 728) {
    IND.pj <- c(IND.pj, spTransform(readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216", toString(i)), CRS("+proj=longlat +datum=WGS84")))
  }
}

# Geometry-only copies of each layer. tmp_IND was previously used without
# ever being created; it is now built from IND.pj.
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)
tmp_IND <- lapply(IND.pj, geometry)

# Inspect the objects only after they exist.
summary(tmp_IND[[1]])
summary(tmp_CC)

IND <- tmp_IND

tmps <- list(tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT, IND)

# The first five entries of tmps are spatial objects, but the sixth is a plain
# list of them, and a plain list has no @lines slot. Flatten it so every
# source is a single spatial object before reading its slots.
line_sources <- c(tmps[1:5], tmps[[6]])

# Collect every Line segment of every feature of every source, in order.
# NOTE(review): the id and attribute chunks below are built from the five
# category layers only, so this list is longer than `new_id` whenever
# individual routes are present - confirm which set the map should use.
Line_list <- list()
for (src in line_sources) {
  for (feature in src@lines) {
    Line_list <- c(Line_list, feature@Lines)
  }
}
```


make new_id
```{r}
# The five reprojected category layers, in the order used for ids below.
pjs <- list(CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)

# Number of Line segments in each feature, over all layers in order.
# lapply/vapply avoids the hard-coded layer count and sapply()'s
# input-dependent return type.
LinLSs <- unlist(lapply(pjs, function(layer) {
  vapply(layer@lines, function(x) length(x@Lines), integer(1))
}))

# One id per segment: "<feature index>_<segment index within the feature>".
# Building a list and flattening it always yields a plain character vector,
# even when every feature has the same number of segments.
new_id <- unlist(lapply(seq_along(LinLSs), function(x) {
  paste0(x, "_", seq_len(LinLSs[[x]]))
}))
```

```{r}
## Attribute table (route_id only), one row per Line segment.
# Segment count of every feature, in the same layer/feature order as new_id.
seg_counts <- unlist(lapply(pjs, function(layer) {
  vapply(layer@lines, function(f) length(f@Lines), integer(1))
}))

# One route_id per feature, stacked over the layers...
DAT <- do.call(rbind, lapply(pjs, function(layer) {
  data.frame(route_id = layer@data$VAR_IDENT)
}))
# ...then repeated once per segment, so the table has exactly one row per id
# even when a feature consists of several segments.
DAT <- DAT[rep(seq_len(nrow(DAT)), times = seg_counts), , drop = FALSE]
rownames(DAT) <- new_id

# Take the segments from the same `pjs` layers that new_id and DAT come from,
# so counts and order agree. Line_list also draws on the individual-route
# layers, which have no ids or attributes here.
segments <- do.call(c, lapply(pjs, function(layer) {
  do.call(c, lapply(layer@lines, function(f) f@Lines))
}))

# Wrap each segment in its own Lines object under its id, then assemble the
# spatial data frame, carrying over the layers' coordinate reference system.
SLDF <- mapply(function(x, y) Lines(x, ID = y),
               x = segments, y = new_id, SIMPLIFY = FALSE) %>%
  SpatialLines(proj4string = pjs[[1]]@proj4string) %>%
  SpatialLinesDataFrame(data = DAT)
```


make new lines and LA map
```{r}
# Look up the map centre. The query previously read "Los Angels".
dat <- geocode("Los Angeles")

# Bus lines in black, bus stops as red circles, on top of the default tiles.
# setView() needs plain numbers, so extract the lon/lat values with `$`
# instead of passing one-column data frames.
leaflet() %>%
  setView(lng = dat$lon, lat = dat$lat, zoom = 11) %>%
  addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>%
  addCircles(data = SOL.pj@data, lng = ~LONG, lat = ~LAT, color = "red", weight = 0.3) %>%
  addTiles()
```


## Q3

First I load the data.
```{r}
library(quantmod)
library(highcharter)

# Download both exchange-rate series from the "oanda" source and keep the
# returned objects in x and y rather than auto-assigning them by symbol name.
x <- getSymbols(Symbols = "AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols(Symbols = "GBP/USD", src = "oanda", auto.assign = FALSE)
```

Next make Bollinger's bands for each exchange rate.
```{r}
# Compute the Bollinger bands once per series and pick out the lower band,
# upper band and moving average, instead of recomputing the whole indicator
# for every component.
x.BBands <- BBands(x)
x.BBands.ll <- x.BBands$dn
x.BBands.ul <- x.BBands$up
x.BBands.m <- x.BBands$mavg

y.BBands <- BBands(y)
y.BBands.ll <- y.BBands$dn
y.BBands.ul <- y.BBands$up
y.BBands.m <- y.BBands$mavg
```

The drawing code is as follows.
```{r}
# Stock-type chart with two stacked panes: AUD/JPY on axis 0 (top half) and
# GBP/USD on axis 1 (bottom half), each with its Bollinger bands and moving
# average.
hc <- highchart(type = "stock") %>%
  hc_title(text = "Charting Exchange Rates") %>%
  hc_subtitle(text = "Data extracted using quantmod package") %>%
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset = 0, opposite = TRUE),
    list(top = "50%", height = "50%", offset = 0, opposite = TRUE)
  ) %>%
  # Top pane: AUD/JPY
  hc_add_series(
    x,
    id = "audjpy", name = "audjpy",
    yAxis = 0, color = "blue", lineWidth = 1.5
  ) %>%
  hc_add_series(
    x.BBands.ll,
    id = "audjpy.ll", name = "audjpy Lower BBands",
    yAxis = 0, color = "black", dashStyle = "shortdash", lineWidth = 1
  ) %>%
  hc_add_series(
    x.BBands.ul,
    id = "audjpy.ul", name = "audjpy Upper BBands",
    yAxis = 0, color = "black", lineWidth = 1
  ) %>%
  hc_add_series(
    x.BBands.m,
    id = "audjpy.m", name = "audjpy BBands MA",
    yAxis = 0, color = "red", lineWidth = 1
  ) %>%
  # Bottom pane: GBP/USD
  hc_add_series(
    y,
    id = "gbpusd", name = "gbpusd",
    yAxis = 1, color = "green", lineWidth = 1.5
  ) %>%
  hc_add_series(
    y.BBands.ll,
    id = "gbpusd.ll", name = "gbpusd Lower BBands",
    yAxis = 1, color = "black", dashStyle = "shortdash", lineWidth = 1
  ) %>%
  hc_add_series(
    y.BBands.ul,
    id = "gbpusd.ul", name = "gbpusd Upper BBands",
    yAxis = 1, color = "black", lineWidth = 1
  ) %>%
  hc_add_series(
    y.BBands.m,
    id = "gbpusd.m", name = "gbpusd BBands MA",
    yAxis = 1, color = "red", lineWidth = 1
  ) %>%
  hc_add_theme(hc_theme_538())

hc
```


## Q4

Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.
```{r}
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

# Work on a copy of `flights`: turn the four text columns into factors, then
# convert the whole table to an ffdf.
tmp <- flights
factor_cols <- c("carrier", "tailnum", "origin", "dest")
for (col in factor_cols) {
  tmp[[col]] <- as.factor(tmp[[col]])
}

flightff <- as.ffdf(tmp)
```

Next I make new columns as follows
```{r}
# Delay: 1 when dep_delay is zero or positive, 0 otherwise.
flightff$Delay <- ffifelse(flightff$dep_delay > 0 | flightff$dep_delay == 0, 1, 0)
# DepHour: copy of `hour`.
flightff$DepHour <- flightff$hour
# Car: 1 for carriers DL, US, DH and UA, 0 otherwise.
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL", "US", "DH", "UA")), 1, 0)
# Night: 1 when the hour is after 18 or before 6, 0 otherwise.
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
# Weekend: chron's day.of.week() numbers the days 0 (Sunday) to 6 (Saturday),
# so both ends of that range are weekend days. Testing only `== 6` marked
# Saturdays alone.
dow <- day.of.week(month = flightff$month, day = flightff$day, year = flightff$year)
flightff$Weekend <- ffifelse(dow == 6 | dow == 0, 1, 0)
```

I exclude the rows whose Delay values are NA and rename it to logitff.
And then I split the dataset into train set and test set.
```{r}
# Keep only rows whose Delay is known.
logitff <- flightff[!is.na(flightff$Delay), ]

# Sequential split: the first 70% of the rows form the training set and the
# remaining rows form the test set.
p <- 0.7
indx <- ff(1:nrow(logitff))
n_train <- trunc(length(indx) * p)

trainIndx <- ff(indx[1:n_train])
testIndx <- ff(indx[(n_train + 1):length(indx)])

trainset <- logitff[trainIndx, ]
testset <- logitff[testIndx, ]
```

Logistic regression 
```{r}
# Binomial-family regression of Delay on the four derived predictors, fitted
# on the training ffdf with sandwich standard errors requested.
fit <- bigglm.ffdf(
  Delay ~ DepHour + Car + Night + Weekend,
  data = trainset,
  family = binomial(),
  sandwich = TRUE
)
summary(fit)
```

predict and make confusionmatrix in train_set
```{r}
# Predicted values on the training set, cut at 0.5 into classes 1 / 0.
train_pred <- predict(fit, newdata = trainset, type = "response")
train_pred <- ifelse(train_pred > 0.5, 1, 0)

# Cross-tabulate observed Delay (first dimension) against predicted class
# (second dimension) and append the margin totals.
train_confusion <- addmargins(
  table(
    as.integer(as.data.frame(trainset)$Delay),
    as.integer(train_pred)
  )
)
train_confusion
```

predict and make confusionmatrix in test_set
```{r}
# Predicted values on the test set, cut at 0.5 into classes 1 / 0.
test_pred <- predict(fit, newdata = testset, type = "response")
test_pred <- ifelse(test_pred > 0.5, 1, 0)

# Cross-tabulate observed Delay (first dimension) against predicted class
# (second dimension) and append the margin totals.
test_confusion <- addmargins(
  table(
    as.integer(as.data.frame(testset)$Delay),
    as.integer(test_pred)
  )
)
test_confusion
```

Draw ROC curve
```{r}
# Recompute the raw test-set predictions (test_pred was overwritten with 0/1
# classes above), then build and plot the ROC curve against observed Delay.
test_pred <- predict(fit, newdata = testset, type = "response")
roc <- roc(
  as.integer(as.data.frame(testset)$Delay),
  as.numeric(test_pred)
)
plot(roc)
```


## Q5

First I load the data. And before using spark I delete the irrelevant columns.
```{r}
library(sparklyr)
library(dplyr)
library(readr)
# Open a local Spark connection.
sc <- spark_connect(master = "local")

# Is there a problem with the original csv? -> "Unspecified" seems to be
# rejected -> maybe change Unspecified to 0 -> then "PASSENGER VEHICKE" is
# rejected instead.
# In short, every String value seems to fail. Yet with the titanic data Strings
# are passed without trouble. Why?
# Those two examples show that whitespace itself is not the problem.

# Read the collisions csv, keep only the four columns used below, and copy the
# result into Spark as "nypd_tbl" (replacing any existing table of that name).
dat <- read_csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/NYPD_Motor_Vehicle_Collisions.csv")
keep_cols <- c("BOROUGH","LATITUDE","LONGITUDE","UNIQUE KEY")
dat <- dat[, keep_cols]
nypd_tbl <- copy_to(sc, dat, "nypd_tbl", overwrite = TRUE)
```

Remove the observations satisfying the condition
```{r}
# Keep rows with a non-empty borough and with latitude/longitude that are
# neither missing nor zero.
nypd_tbl <- filter(
  nypd_tbl,
  BOROUGH != "",
  !is.na(LATITUDE),
  !is.na(LONGITUDE),
  LATITUDE != 0,
  LONGITUDE != 0
)
nypd_tbl
```

Split this data into trainset and testset.
```{r}
# Split the Spark table into a 90% "training" and a 10% "test" partition with
# a fixed seed, then pull the two partitions out by name.
partitions <- sdf_partition(nypd_tbl, training = 0.9, test = 0.1, seed = 123)
train <- partitions$training
test <- partitions$test
```

Use Decision tree
```{r}
# Fit a decision tree predicting BOROUGH from latitude and longitude.
# The call previously ended in a dangling `%>%`, which left the expression
# incomplete so the chunk could not be parsed.
decision_tree <- train %>%
  ml_decision_tree(
    response = "BOROUGH",
    features = c("LATITUDE", "LONGITUDE"),
    max.bins = 200L,
    max.depth = 10L,
    seed = 123L
  )
```

Prediction
```{r}
# Score the test partition with the fitted tree and bring the result back
# into R.
pred <- collect(sdf_predict(decision_tree, test))

# Cross-tabulate the actual borough against the predicted value.
table(pred$BOROUGH, pred$prediction)
```

































